# All functions from trackr package

#' Count the number of NAs in each variable
#'
#' Builds a per-column summary holding the number of missing values, the
#' number of non-missing values and the proportion missing.
#'
#' @param data dataset
#' @param sort logical. If TRUE, the final count will be sorted by the
#'   proportion of NAs (largest first)
#' @param return_count logical. If FALSE, the count is printed, and the
#'   dataset is invisibly returned. If TRUE, the count itself is returned and
#'   nothing is printed.
#'
#' @return \code{data}, invisibly, when \code{return_count} is FALSE;
#'   otherwise a tibble with the columns \code{variable}, \code{missing},
#'   \code{non_missing} and \code{prop_missing}.
#' @export
#'
#' @examples
#' count_nas(data.frame(a = c(1, NA, 3), b = c(NA, NA, "x")))
count_nas <- function(data, sort = FALSE, return_count = FALSE) {
  na_counts <- tibble::enframe(
    colSums(is.na(data)),
    name = "variable",
    value = "missing"
  )

  # Every cell is either missing or not, so the non-missing count is simply
  # the row count minus the missing count (no second pass or join needed).
  na_counts <- dplyr::mutate(
    na_counts,
    non_missing = nrow(data) - missing,
    prop_missing = missing / (missing + non_missing)
  )

  if (sort) {
    na_counts <- dplyr::arrange(na_counts, -prop_missing)
  }

  # Only print in "pass-through" mode; when the count is requested as the
  # return value the caller decides what to do with it.
  if (return_count) {
    return(na_counts)
  }

  print(na_counts, n = Inf)
  invisible(data)
}

#' Print a count in the middle of a pipeline
#'
#' Runs \code{dplyr::count()} on the data, prints every row of the result and
#' then hands the original dataset back so the pipeline can continue.
#'
#' @param data dataset
#' @param ... arguments forwarded to \code{dplyr::count()}
#'
#' @return \code{data}, invisibly.
#' @export
#'
#' @examples
#' count_print(mtcars, cyl)
count_print <- function(data, ...) {
  tally <- dplyr::count(data, ...)
  print_all(tally)

  invisible(data)
}


#' Create a relative frequency table and print
#'
#' Counts the data with \code{dplyr::count()} and adds a \code{prop} column
#' holding each count as a share of the total.
#'
#' @param data dataset
#' @param ... arguments forwarded to \code{dplyr::count()}
#' @param return_count logical. If FALSE, the count is printed, and the
#'   dataset is invisibly returned. If TRUE, the count itself is returned.
#'
#' @return \code{data}, invisibly, when \code{return_count} is FALSE;
#'   otherwise the frequency table.
#' @export
#'
#' @examples
#' count_prop(mtcars, cyl)
count_prop <- function(data, ..., return_count = FALSE) {
  freq_table <- dplyr::mutate(dplyr::count(data, ...), prop = n / sum(n))

  # Early exit: hand the table itself back without printing
  if (return_count) {
    return(freq_table)
  }

  print_all(freq_table)
  invisible(data)
}

#' drop_na and see how many you drop
#'
#' Removes rows with missing values via \code{tidyr::drop_na()} and prints a
#' three-row table with the number (and share) of rows kept, removed and in
#' total.
#'
#' @param data dataset
#' @param ... arguments forwarded to \code{tidyr::drop_na()}
#'
#' @return The dataset after dropping rows.
drop_na_track <- function(data, ...) {

  # Namespace-qualified (like the rest of the file) so the function does not
  # depend on tidyr/tibble being attached by the caller.
  dat_dropped <- tidyr::drop_na(data, ...)

  n_old <- nrow(data)
  n_new <- nrow(dat_dropped)

  drop_summary <- tibble::tibble(
    obs_type = c("kept", "removed", "total"),
    n = c(n_new, n_old - n_new, n_old),
    prop = n / n_old
  )
  print(drop_summary)

  dat_dropped

}

#' Join two datasets and see how many matches there are
#'
#' Performs the requested join, prints a table with the number (and share) of
#' result rows that came from x only, from y only, or from both, and returns
#' the joined data.
#'
#' @param x dataset
#' @param y dataset
#' @param by A character vector of variables to join by
#' @param suffix If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.
#' @param .merge If TRUE, then joined dataset will contain a variable called .merge that indicates which dataset the row came from
#' @param join_type Which type of join? A join function such as
#'   \code{dplyr::full_join}, \code{dplyr::left_join},
#'   \code{dplyr::right_join} or \code{dplyr::inner_join}
#' @param ... Other arguments passed to the _join function
#'
#' @return The joined dataset, with a \code{.merge} column
#'   ("x_only", "y_only" or "matched") when \code{.merge} is TRUE.
#' @export
#'
#' @examples
#' join_track(
#'   data.frame(id = 1:3, a = letters[1:3]),
#'   data.frame(id = 2:4, b = LETTERS[2:4]),
#'   by = "id", join_type = dplyr::full_join
#' )
join_track <- function(x, y, by = NULL, suffix = c(".x", ".y"),
                      .merge = FALSE, join_type, ...) {

  # Normalise a (possibly partially) named `by`, e.g. c("id", "x1" = "x2"):
  # empty names/values are blanked to NA and then filled in from the other
  # half of the pair, so every element ends up as name = value.
  if (!is.null(names(by))) {
    by_cols <- by %>%
      tibble::enframe() %>%
      rlang::set_names(c("l", "r")) %>%
      dplyr::mutate(dplyr::across(tidyselect::everything(), haven::zap_empty)) %>%
      dplyr::mutate(l = dplyr::coalesce(l, r),
                    r = dplyr::coalesce(r, l)) %>%
      tibble::deframe()
  } else {
    by_cols <- by
  }

  # Checking to make sure used variable names are not already in use
  if (".x_tracker" %in% names(x)) {
    message("Warning: variable .x_tracker in left data was dropped")
  }
  if (".y_tracker" %in% names(y)) {
    message("Warning: variable .y_tracker in right data was dropped")
  }
  # Scalar condition, so use the short-circuiting scalar operators
  if (.merge && (".merge" %in% names(x) || ".merge" %in% names(y))) {
    stop("Variable .merge already exists; change name before proceeding")
  }

  # Adding simple merge tracker variables to data frames
  x[, ".x_tracker"] <- 1
  y[, ".y_tracker"] <- 1

  # Doing the requested join
  joined <- join_type(x, y, by = by_cols, suffix = suffix, ...)

  # A tracker is NA exactly when the row has no counterpart in that dataset
  in_x <- !is.na(joined[[".x_tracker"]])
  in_y <- !is.na(joined[[".y_tracker"]])

  counts <- tibble::tibble(
    merge_status = c("x_only", "y_only", "matched"),
    n = c(sum(in_x & !in_y), sum(!in_x & in_y), sum(in_x & in_y)),
    prop = n / sum(n)
  )

  print(counts)

  # Create .merge variable if specified. The tracker columns are referenced
  # directly (not through `.`) so this also works on grouped data.
  if (.merge) {
    joined <- dplyr::mutate(
      joined,
      .merge = dplyr::case_when(
        !is.na(.x_tracker) & is.na(.y_tracker) ~ "x_only",
        is.na(.x_tracker) & !is.na(.y_tracker) ~ "y_only",
        TRUE ~ "matched"
      )
    )
  }

  # Dropping tracker variables and returning data frame
  dplyr::select(joined, -.x_tracker, -.y_tracker)

}






#' full_join that shows how many rows match
#'
#' Shortcut for \code{join_track()} with \code{join_type} fixed to
#' \code{dplyr::full_join}.
#'
#' @inheritParams join_track
#'
#' @return The joined dataset, as returned by \code{join_track()}.
#' @export
#'
#' @examples
#' full_join_track(
#'   data.frame(id = 1:3, a = letters[1:3]),
#'   data.frame(id = 2:4, b = LETTERS[2:4]),
#'   by = "id"
#' )
full_join_track <- function(x, y, by = NULL, suffix = c(".x", ".y"),
                            .merge = FALSE, ...) {
  join_track(
    x, y,
    by = by,
    suffix = suffix,
    .merge = .merge,
    join_type = dplyr::full_join,
    ...
  )
}

#' left_join that shows how many rows match
#'
#' Shortcut for \code{join_track()} with \code{join_type} fixed to
#' \code{dplyr::left_join}.
#'
#' @inheritParams join_track
#'
#' @return The joined dataset, as returned by \code{join_track()}.
#' @export
#'
#' @examples
#' left_join_track(
#'   data.frame(id = 1:3, a = letters[1:3]),
#'   data.frame(id = 2:4, b = LETTERS[2:4]),
#'   by = "id"
#' )
left_join_track <- function(x, y, by = NULL, suffix = c(".x", ".y"),
                            .merge = FALSE, ...) {
  join_track(
    x, y,
    by = by,
    suffix = suffix,
    .merge = .merge,
    join_type = dplyr::left_join,
    ...
  )
}

#' right_join that shows how many rows match
#'
#' Shortcut for \code{join_track()} with \code{join_type} fixed to
#' \code{dplyr::right_join}.
#'
#' @inheritParams join_track
#'
#' @return The joined dataset, as returned by \code{join_track()}.
#' @export
#'
#' @examples
#' right_join_track(
#'   data.frame(id = 1:3, a = letters[1:3]),
#'   data.frame(id = 2:4, b = LETTERS[2:4]),
#'   by = "id"
#' )
right_join_track <- function(x, y, by = NULL, suffix = c(".x", ".y"),
                            .merge = FALSE, ...) {
  join_track(
    x, y,
    by = by,
    suffix = suffix,
    .merge = .merge,
    join_type = dplyr::right_join,
    ...
  )
}

#' inner_join that shows how many rows match
#'
#' Shortcut for \code{join_track()} with \code{join_type} fixed to
#' \code{dplyr::inner_join}.
#'
#' @inheritParams join_track
#'
#' @return The joined dataset, as returned by \code{join_track()}.
#' @export
#'
#' @examples
#' inner_join_track(
#'   data.frame(id = 1:3, a = letters[1:3]),
#'   data.frame(id = 2:4, b = LETTERS[2:4]),
#'   by = "id"
#' )
inner_join_track <- function(x, y, by = NULL, suffix = c(".x", ".y"),
                             .merge = FALSE, ...) {
  join_track(
    x, y,
    by = by,
    suffix = suffix,
    .merge = .merge,
    join_type = dplyr::inner_join,
    ...
  )
}

# d1 <- tibble(x1 = sample(1:100, 100, replace = TRUE),
#              y = sample(1:100, 100, replace = TRUE))
#
# d2 <- tibble(x2 = sample(1:100, 200, replace = TRUE),
#              y = sample(1:100, 200, replace = TRUE),
#              z = sample(1:100, 200, replace = TRUE))
#
# right_join_track(d1, d2, by = c("x1" = "x2"), .merge = TRUE)

#
# df <- tibble(
#   x = sample(1:100, 40, replace = TRUE),
#   y = sample(1:100, 40, replace = TRUE)
# )
#
# edited_vars <- df %>% dplyr:::mutate_cols(x = if_else(y > 40, 0L, x)) %>% names()
#
# new_df <- df %>%
#   mutate(x = if_else(y > 40, 0L, x)) %>%
#   mutate(x = if_else(y < 30, NA_integer_, x))
#
# harsh_equal <- function(x, y) {
#   x == y | (is.na(x) & !is.na(y)) | (!is.na(x) & is.na(y))
# }
#
# for (var in edited_vars) {
#   old_x <- df[[var]]      # this will return null if it doesn't exist in old df
#   new_x <- new_df[[var]]
#
#   n_changes <- sum(harsh_equal(old_x, new_x))
#   n_new_nas <- sum(!is.na(df$x) & is.na(new_df$x))
#
#   if (is.null(old_x)) {
#     print(paste0(var, " is a new variable"))
#   } else {
#     print(paste0(var, ": ", n_changes, " changes were made"))
#     print(paste0(var, ": ", n_new_nas, " rows were changed to NA"))
#   }
# }
#
# df[["z"]]
# n_changes <- sum(harsh_equal(df$x, new_df$x))
# n_new_nas <- sum(!is.na(df$x) & is.na(new_df$x))
#
#
# ?mutate
# mutate
# mutate.data.frame
# getS3method("mutate", "data.frame")
# dplyr:::mutate_cols



#' Track what changes are made when you mutate
#'
#' Applies \code{dplyr::mutate()} and prints, for each affected variable,
#' either that it is new or how many values changed (and how many of those
#' became NA). A variable is reported when it is named in the call or when
#' its contents differ from the input; variables named in the call that no
#' longer exist afterwards are reported as removed.
#'
#' @param .data dataset
#' @param ... arguments to pass to mutate
#'
#' @return The mutated dataset.
#' @export
#'
#' @examples
#' mutate_track(
#'   data.frame(a = 1:3, b = c(5, 50, 60)),
#'   a = ifelse(b > 40, NA, a)
#' )
mutate_track <- function(.data, ...) {

  new_df <- dplyr::mutate(.data, ...)

  # Names given in the call (e.g. `a = ...`), captured without evaluating the
  # expressions a second time and without relying on dplyr internals.
  requested <- names(rlang::enquos(...))

  # Columns whose contents are not identical to the input (new columns
  # compare against NULL and therefore count as changed).
  is_changed <- vapply(
    names(new_df),
    function(var) !identical(.data[[var]], new_df[[var]]),
    logical(1)
  )
  edited_vars <- names(new_df)[is_changed | names(new_df) %in% requested]
  removed_vars <- setdiff(intersect(requested, names(.data)), names(new_df))

  # Number of positions where old and new differ, treating NA vs NA as equal
  # and NA vs a value as different.
  count_changes <- function(old, new) {
    same <- old == new
    undecided <- is.na(same)
    same[undecided] <- (is.na(old) & is.na(new))[undecided]
    sum(!same)
  }

  for (var in edited_vars) {
    old_x <- .data[[var]]     # NULL if the variable doesn't exist in old df
    new_x <- new_df[[var]]

    if (is.null(old_x)) {
      print(paste0(var, " is a new variable"))
      next
    }

    n_changes <- count_changes(old_x, new_x)
    n_new_nas <- sum(!is.na(old_x) & is.na(new_x))

    print_string <- paste0(var, ": ", n_changes, " changes made")
    if (n_new_nas > 0) {
      print_string <- paste0(print_string, ", ", n_new_nas, " rows changed to NA")
    }

    print(print_string)
  }

  for (var in removed_vars) {
    print(paste0(var, " was removed"))
  }

  return(new_df)

}

# tibble(
#   a = sample(1:100, 100, replace = TRUE),
#   b = sample(1:100, 100, replace = TRUE)
# ) %>%
#   mutate_track(a = ifelse(b > 50, 20, a)) %>%
#   mutate_track(a = ifelse(b < 20, NA, a)) %>%
#   mutate_track(b = a)

#' Prints all rows of a tibble
#'
#' Converts the input to a tibble and prints it without truncating the rows.
#'
#' @param data dataset (anything \code{tibble::as_tibble()} accepts)
#'
#' @return The value returned by \code{print()} for the converted tibble.
#' @export
#'
#' @examples
#' print_all(mtcars)
print_all <- function(data) {
  as_tbl <- tibble::as_tibble(data)
  print(as_tbl, n = Inf)
}

#' Last non-NA in a vector
#'
#' Calculate the last value in a vector that's not an NA, otherwise return NA
#'
#' @param x vector
#'
#' @return A length-one value: the last non-missing element of \code{x}, or
#'   a missing value of the same type/class as \code{x} (factor levels and
#'   Date class are kept) when there is none.
#' @export
#'
#' @examples
#' last_non_na(c(1, NA, 3, NA))
#' last_non_na(factor(c(NA, NA), levels = c("a", "b")))
last_non_na <- function(x) {
  x_no_na <- x[!is.na(x)]

  if (length(x_no_na) > 0) {
    return(x_no_na[[length(x_no_na)]])
  }

  # Nothing observed. Subsetting with an NA index yields a single missing
  # value that keeps the type and class of `x` (including factor levels),
  # which avoids rebuilding attributes by hand. The NA index also produces an
  # NA name on named vectors, so names are removed.
  unname(x[NA_integer_])
}

#' First non-NA in a vector
#'
#' Calculate the first value in a vector that's not an NA, otherwise return NA
#'
#' @param x vector
#'
#' @return A length-one value: the first non-missing element of \code{x}, or
#'   a missing value of the same type/class as \code{x} (factor levels and
#'   Date class are kept) when there is none.
#' @export
#'
#' @examples
#' first_non_na(c(NA, 2, 3))
#' first_non_na(factor(c(NA, NA), levels = c("a", "b")))
first_non_na <- function(x) {
  x_no_na <- x[!is.na(x)]

  if (length(x_no_na) > 0) {
    return(x_no_na[[1L]])
  }

  # Nothing observed. Subsetting with an NA index yields a single missing
  # value that keeps the type and class of `x` (including factor levels),
  # which avoids rebuilding attributes by hand. The NA index also produces an
  # NA name on named vectors, so names are removed.
  unname(x[NA_integer_])
}

#' Calculate max of a vector or return NA if there are no non-missing values to calculate from
#'
#' Useful for summarising
#'
#' @param ... vectors passed on to \code{max()}
#'
#' @return The maximum of the non-missing values, or NA when there are none.
#' @export
#'
#' @examples
#' max_na(c(1, 5, NA))
#' max_na(c(NA_real_, NA_real_))
max_na <- function(...) {
  # TRUE when at least one supplied value is non-missing
  has_values <- any(vapply(list(...), function(v) any(!is.na(v)), logical(1)))

  max_val <- suppressWarnings(max(..., na.rm = TRUE))

  # An infinite result only means "nothing to aggregate" when there really
  # were no non-missing inputs; a genuine Inf/-Inf in the data is kept. The
  # assignment is performed unconditionally (it may select nothing) so the
  # result type matches the previous implementation.
  max_val[!has_values & is.infinite(max_val)] <- NA_real_
  max_val
}

#' Calculate min of a vector or return NA if there are no non-missing values to calculate from
#'
#' Useful for summarising
#'
#' @param ... vectors passed on to \code{min()}
#'
#' @return The minimum of the non-missing values, or NA when there are none.
#' @export
#'
#' @examples
#' min_na(c(4, 2, NA))
#' min_na(c(NA_real_, NA_real_))
min_na <- function(...) {
  # TRUE when at least one supplied value is non-missing
  has_values <- any(vapply(list(...), function(v) any(!is.na(v)), logical(1)))

  min_val <- suppressWarnings(min(..., na.rm = TRUE))

  # An infinite result only means "nothing to aggregate" when there really
  # were no non-missing inputs; a genuine Inf/-Inf in the data is kept. The
  # assignment is performed unconditionally (it may select nothing) so the
  # result type matches the previous implementation.
  min_val[!has_values & is.infinite(min_val)] <- NA_real_
  min_val
}

#' Calculate sum of a vector or return NA if there are no non-missing values to calculate from
#'
#' Useful for summarising
#'
#' @param x vector
#'
#' @return The sum of the non-missing values of \code{x}, or NA when
#'   \code{x} has no non-missing values (including when it is empty).
#' @export
#'
#' @examples
#' sum_na(c(1, 2, NA))
#' sum_na(c(NA, NA))
sum_na <- function(x) {
  # all() is TRUE for an empty vector too, so "empty" and "all missing" are
  # handled by the same branch
  if (all(is.na(x))) {
    return(NA)
  }

  sum(x, na.rm = TRUE)
}

#' Calculate mean of a vector or return NA if there are no non-missing values to calculate from
#'
#' Useful for summarising
#'
#' @param x vector
#'
#' @return The mean of the non-missing values of \code{x}, or NA when
#'   \code{x} has no non-missing values (including when it is empty).
#' @export
#'
#' @examples
#' mean_na(c(1, 3, NA))
#' mean_na(c(NA, NA))
mean_na <- function(x) {
  # all() is TRUE for an empty vector too, so "empty" and "all missing" are
  # handled by the same branch
  if (all(is.na(x))) {
    return(NA)
  }

  mean(x, na.rm = TRUE)
}

#' Calculate median of a vector or return NA if there are no non-missing values to calculate from
#'
#' Useful for summarising
#'
#' @param x vector
#'
#' @return The median of the non-missing values of \code{x}, or NA when
#'   \code{x} has no non-missing values (including when it is empty).
#' @export
#'
#' @examples
#' median_na(c(1, 9, 2, NA))
#' median_na(c(NA, NA))
median_na <- function(x) {
  # all() is TRUE for an empty vector too, so "empty" and "all missing" are
  # handled by the same branch
  if (all(is.na(x))) {
    return(NA)
  }

  stats::median(x, na.rm = TRUE)
}


#' Replace with NA if certain value
#'
#' @param x vector
#' @param values values to replace with NA
#'
#' @return \code{x} with every element found in \code{values} set to NA. The
#'   type, class and other attributes of \code{x} are kept.
#' @export
#'
#' @examples
#' replace_with_na(c(1, -99, 3), -99)
#' replace_with_na(factor(c("a", "unknown", "b")), "unknown")
replace_with_na <- function(x, values) {
  # Assign in place rather than via ifelse(): ifelse() builds its result from
  # the logical test and so discards the class of `x` (factors, Dates, ...).
  x[x %in% values] <- NA
  x
}


#' Function to quickly see all columns in a df
#'
#' Prints one row per column name (position and name) without truncation.
#'
#' @param x data
#'
#' @return \code{x}, invisibly.
#' @export
#'
#' @examples
#' print_names(mtcars)
print_names <- function(x) {
  # Namespace-qualified (like the rest of the file) so the function does not
  # depend on tibble being attached by the caller.
  print_all(tibble::enframe(names(x)))
  invisible(x)
}



#' Function to check which variables differ across two rows
#'
#' Compares row k with row k + 1 and prints the variables whose values
#' differ. A missing value next to a non-missing value counts as a
#' difference; two missing values do not.
#'
#' @param tibble_data dataset
#' @param k row number of the first of the two consecutive rows to compare
#'
#' @return Called for its printed output: a tibble of the differing variables
#'   and their values in both rows, or a message when the rows do not differ.
#' @export
#'
#' @examples
#' check_row_diff(data.frame(a = c(1, 1), b = c(NA, 2), c = c("x", "y")))
check_row_diff <- function(tibble_data, k = 1) {
  # Check that k is valid
  if (k <= 0 || k >= nrow(tibble_data)) {
    stop("k should be a positive integer less than the number of rows in the tibble")
  }

  # Extract rows k and k+1 (drop = FALSE keeps a one-column data.frame from
  # collapsing into a bare vector)
  row_k <- tibble_data[k, , drop = FALSE]
  row_k_plus_1 <- tibble_data[k + 1, , drop = FALSE]

  # NA-aware comparison of one cell from each row. A plain `!=` returns NA as
  # soon as either side is missing, which would hide that difference.
  cell_differs <- function(a, b) {
    if (is.list(a) || is.list(b)) {
      return(!identical(a, b))
    }
    na_a <- is.na(a)
    na_b <- is.na(b)
    both_present <- !na_a & !na_b
    any(na_a != na_b) || any(a[both_present] != b[both_present])
  }

  # Compare the two rows
  is_different <- vapply(
    names(row_k),
    function(var) cell_differs(row_k[[var]], row_k_plus_1[[var]]),
    logical(1)
  )
  differing_vars <- names(row_k)[unname(is_different)]

  # Output a tibble with the differences
  if (length(differing_vars) == 0) {
    message("No differences between row ", k, " and row ", k + 1)
  } else {
    differences_tibble <- tibble::tibble(
      variable = differing_vars,
      value_row_k = unlist(row_k[differing_vars]),
      value_row_k_plus_1 = unlist(row_k_plus_1[differing_vars])
    )
    print(differences_tibble)
  }
}

#' Sample n groups in a grouped dataset
#'
#' For grouped data, picks n groups at random and returns every row that
#' belongs to them. For ungrouped data, picks n rows at random. If fewer than
#' n groups (or rows) exist, all of them are returned.
#'
#' @param data dataset, optionally grouped with \code{dplyr::group_by()}
#' @param n numeric, how many groups to sample
#'
#' @return The sampled rows of \code{data}.
#' @export
#'
#' @examples
#' sample_n_groups(dplyr::group_by(mtcars, cyl), 2)
sample_n_groups <- function(data, n) {

  if (!dplyr::is_grouped_df(data)) {
    # sample_n() errors when asked for more rows than exist, so cap the size
    return(dplyr::sample_n(data, min(n, nrow(data))))
  }

  group_var <- dplyr::group_vars(data)

  # One row per group that actually occurs in the data
  group_ids <- dplyr::distinct(
    dplyr::select(dplyr::ungroup(data), dplyr::all_of(group_var))
  )

  # Same cap as above, applied to the number of groups
  n_sampled <- min(n, nrow(group_ids))

  warning(paste0("Showing ", n_sampled, " groups by ", paste0(group_var, collapse = ", ")))

  sampled_ids <- dplyr::sample_n(group_ids, n_sampled)

  # Get dataset with all info on the sampled groups
  sampled_df <- dplyr::semi_join(data, sampled_ids, by = group_var)

  return(sampled_df)

}








#' Randomly view N rows
#'
#' Opens a random sample of the data in the viewer. For grouped data (and
#' \code{by_group = TRUE}) whole groups are sampled; otherwise individual
#' rows are sampled. If fewer than n groups/rows exist, all are shown.
#'
#' @param data data.frame or tibble
#' @param n How many rows (or groups) to view
#' @param by_group If TRUE, then view n randomly selected groups, if false then view n randomly selected rows
#'
#' @return \code{data}, invisibly.
#' @export
#'
#' @examples
#' \dontrun{
#' view_n(mtcars, n = 10)
#' }
view_n <- function(data, n = 200, by_group = TRUE) {

  grouped <- dplyr::is_grouped_df(data)

  if (grouped && by_group) {

    # Never request more groups than actually contain rows; sampling more
    # than exist is an error.
    n_available <- sum(dplyr::group_size(data) > 0)

    sampled_df <- sample_n_groups(data, min(n, n_available))

    tibble::view(sampled_df)

  } else {

    if (grouped) {
      warning("Not showing by groups")
    }

    # Ungroup first: sample_n() on a grouped data frame samples within each
    # group rather than n rows overall.
    all_rows <- dplyr::ungroup(data)

    tibble::view(dplyr::sample_n(all_rows, min(n, nrow(all_rows))))

  }

  invisible(data)

}


#' View a randomly selected sample of observations after applying a filter
#'
#' Filters the data with \code{dplyr::filter()} and passes the result to
#' \code{view_n()}. The unfiltered dataset is returned so a pipeline can
#' continue.
#'
#' @param data data.frame or tibble
#' @param ... arguments to pass to filter
#' @param n number of observations (or groups) to view
#' @param by_group If TRUE, then view n randomly selected groups, if false then view n randomly selected rows
#'
#' @return \code{data}, invisibly.
#' @export
#'
#' @examples
#' \dontrun{
#' view_filter(mtcars, cyl == 4, n = 5)
#' }
view_filter <- function(data, ..., n = 200, by_group = TRUE) {

  matching_rows <- dplyr::filter(data, ...)
  view_n(matching_rows, n = n, by_group = by_group)

  invisible(data)

}

#' View a randomly selected sample of observations after selecting only some variables to view
#'
#' Selects columns with \code{dplyr::select()} and passes the result to
#' \code{view_n()}. The full dataset is returned so a pipeline can continue.
#'
#' @param data data.frame or tibble
#' @param ... arguments to pass to select
#' @param n number of observations to view
#' @param by_group If TRUE, then view n randomly selected groups, if false then view n randomly selected rows
#'
#' @return \code{data}, invisibly.
#' @export
#'
#' @examples
#' \dontrun{
#' view_select(mtcars, mpg, cyl, n = 5)
#' }
view_select <- function(data, ..., n = 200, by_group = TRUE) {

  chosen_cols <- dplyr::select(data, ...)
  view_n(chosen_cols, n = n, by_group = by_group)

  invisible(data)

}